### DATA PREPARATION

# Data at sentence-level is aggregated to the report-level for analysis


# Session info
# R version 4.1.0 (2021-05-18)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS 13.1


# Load packages -----------------------------------------------------------
# if needed install package first:
#install.packages("tidyverse")
library(tidyverse)

# Load data ---------------------------------------------------------------
# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()).
# That call silently deletes the caller's objects when this file is sourced,
# yet does not give a clean session (attached packages and options persist).
# Restart R before running if a fresh session is needed.

# load sentence level data
data_sent_level <- read_csv("data/sentence_level.csv")

# fail early, with a clear message, if a column used further down is absent
stopifnot(
  "data/sentence_level.csv lacks a column required by this script" =
    all(
      c("id", "prediction", "IEG_Outcome", "type", "year", "country", "name")
      %in% names(data_sent_level)
    )
)


# Wrangle data from sentence to report level------------------------------------------------------------


### Turn the IEG rating into a factor; levels are listed in the order
### "Highly Unsatisfactory" ... "Highly Satisfactory". Values that match none
### of the listed levels become NA.
data_sent_level[["IEG_Outcome"]] <- factor(
  x = data_sent_level[["IEG_Outcome"]],
  levels = c(
    "Highly Unsatisfactory",
    "Unsatisfactory",
    "Moderately Unsatisfactory",
    "Moderately Satisfactory",
    "Satisfactory",
    "Highly Satisfactory"
  )
)

# Flag sentences whose predicted label is "positive".
#   pos - 1 when prediction is "positive", 0 for any other label, NA when the
#         prediction itself is missing
#   n   - constant 1, a generic per-sentence count variable
# Neutral sentences are still present at this point; they are removed later.
data_sent_level <- mutate(
  data_sent_level,
  pos = as.numeric(prediction == "positive"),
  n = 1
)

# Aggregate to one row per report (`id`). Neutral sentences are dropped
# first because they add no information for the current analysis, so
# `sentiment` is the share of positive sentences among the remaining ones.
# All other columns take the value found in the report's first sentence.
data_rep_level <- data_sent_level %>%
  filter(prediction != "neutral") %>%
  group_by(id) %>%
  summarise(
    IEG_Outcome = first(IEG_Outcome),
    type = first(type),
    sentiment = sum(pos) / n(),
    across(c(year, country, name), first)
  )

# Add two numeric encodings of the IEG rating:
#   general_sat     - 0 for the three "Unsatisfactory" ratings,
#                     1 for the three "Satisfactory" ratings
#   IEG_Outcome_num - position of the rating on the six-point scale,
#                     1 = "Highly Unsatisfactory" ... 6 = "Highly Satisfactory"
# A rating that is missing or not on the scale gives NA in both columns.
data_rep_level <- mutate(
  data_rep_level,
  general_sat = case_when(
    IEG_Outcome %in% c(
      "Highly Unsatisfactory",
      "Unsatisfactory",
      "Moderately Unsatisfactory"
    ) ~ 0,
    IEG_Outcome %in% c(
      "Moderately Satisfactory",
      "Satisfactory",
      "Highly Satisfactory"
    ) ~ 1
  ),
  IEG_Outcome_num = as.numeric(
    match(
      as.character(IEG_Outcome),
      c(
        "Highly Unsatisfactory",
        "Unsatisfactory",
        "Moderately Unsatisfactory",
        "Moderately Satisfactory",
        "Satisfactory",
        "Highly Satisfactory"
      )
    )
  )
)


# save the report-level data set
write_csv(data_rep_level, file = "data/data_rep_level.csv")
